crs = 'EPSG:26913'
library(stringr)
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(gganimate)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(FNN)
library(spdep)
library(caret)
library(ckanr)
library(grid)
library(gridExtra)
library(knitr)
library(kableExtra)
library(tidycensus)
library(scales)
library(stargazer)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(mapview)
library(ggmap)
library(jsonlite)
library(magick)
library(magrittr)
library(janitor)
root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
windowsFonts(font = windowsFont('Helvetica'))
ll <- function(dat, proj4 = 4326){st_transform(dat, proj4)}
plotTheme <- function(base_size = 9, title_size = 10){
theme(
text = element_text(family = 'font', color = "black"),
plot.title = element_text(family = 'font',
size = title_size, colour = "black", hjust = 0.5),
plot.subtitle = element_text(family = 'font', face = 'italic',
size = base_size, colour = "black", hjust = 0.5),
plot.caption = element_text(family = 'font', hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.01),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=.5),
strip.background = element_blank(),
strip.text = element_text(family = 'font', size=9),
axis.title = element_text(family = 'font', size=9),
axis.text = element_text(family = 'font', size=7),
axis.text.y = element_text(family = 'font', size=7),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(family = 'font', colour = "black", face = "italic", size = 9),
legend.text = element_text(family = 'font', colour = "black", face = "italic", size = 7),
strip.text.x = element_text(family = 'font', size = 9),
legend.key.size = unit(.5, 'line')
)
}
mapTheme <- function(base_size = 9, title_size = 10){
theme(
text = element_text(family = 'font', color = "black"),
plot.title = element_text(family = 'font',
size = title_size, colour = "black", hjust = 0.5),
plot.subtitle = element_text(family = 'font', face = 'italic',
size = base_size, colour = "black", hjust = 0.5),
plot.caption = element_text(family = 'font', hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
strip.background = element_blank(),
strip.text = element_text(size=base_size),
axis.title = element_text(family = 'font', size=9),
axis.text = element_blank(),
axis.text.y = element_blank(),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(family = 'font', colour = "black", face = "italic", size = 9),
legend.text = element_text(family = 'font', colour = "black", face = "italic", size = 7),
strip.text.x = element_text(size=base_size),
legend.key.size = unit(.5, 'line')
)
}
brightRed = '#f2727f'
darkBlue = '#315d7f'
palette5 <- c("#f9b294","#f2727f","#c06c86","#6d5c7e","#315d7f")
palette4 <- c("#f9b294","#f2727f","#c06c86","#6d5c7e")
palette2 <- c("#f9b294","#f2727f")
palette1_main <- "#F2727F"
palette1_assist <- '#F9B294'
A NJ Transit train disabled by a fire at the Princeton Junction in West Windsor in this 2018 file photo.Ed Murray | NJ Advance Media for
However, sadly, NJ Transit trains broke down more than any other system’s in the nation in 2019, ranking the worst in the nation, according to Federal Transit Administration data. Train Delay will not only cause the convenience to people’s lives but also cause the financial and social lost to the society. Under this circumstance, for NJ Transit’s administration, how to effectively predict and manage the train delay is vital.
Our model’s use case is based on the NJ Transit company. A model to predict delays ahead of schedule by 2 hours will be made toward NJ Transit. These real time predictions will inform the DELAY prediction to the administrator of NJ Transit in this way to help them better dispatch and manage the train resources (for example, redeploy trains and temporarily accelerate the train speed) so that potential long delays could be avoided. Meanwhile, the long-term model results help identifying problems within the transit network.
# County Geographical data
njCounties =
tigris::counties(state = "34")%>%
st_transform(crs)%>%
dplyr::select(GEOID,NAMELSAD,geometry)
# NJ Transit station data
njStations = st_read('https://opendata.arcgis.com/datasets/4809dada94c542e0beff00600ee930f6_0.geojson') %>%
st_transform(crs) %>%
dplyr::select(station = STATION_ID, geometry) %>%
# Trying to standardize the station names...
mutate(station = tolower(station)) %>%
mutate(station =
case_when(
# str_detect(station, '-') ~
# substr(station, 1, str_locate(station, '-')[[1]] - 1),
substr(station, nchar(station) - 15, nchar(station)) == ' railway station'~
substr(station, 1, nchar(station) - 16),
substr(station, nchar(station) - 12, nchar(station)) == ' rail station'~
substr(station, 1, nchar(station) - 13),
substr(station, nchar(station) - 7, nchar(station)) == ' station' ~
substr(station, 1, nchar(station) - 8),
substr(station, nchar(station) - 8, nchar(station)) == ' terminal' ~
substr(station, 1, nchar(station) - 9),
TRUE ~ station
)) %>%
mutate(joined = 1)
# Further modifying station names
njStations[12, 1] = 'middletown nj'
njStations[149, 1] = 'middletown ny'
njStations[131, 1] = 'anderson street'
njStations[99, 1] = 'atlantic city rail'
njStations[87, 1] = 'bay street'
njStations[134, 1] = 'wood ridge'
njStations[90, 1] = 'watsessing avenue'
njStations[133, 1] = 'teterboro'
njStations[159, 1] = 'secaucus upper lvl'
njStations[166, 1] = 'secaucus lower lvl'
njStations[102, 1] = 'ramsey main st'
njStations[156, 1] = 'ramsey route 17'
njStations[102, 1] = 'ramsey main st'
njStations[115, 1] = 'radburn fair lawn'
njStations[140, 1] = 'princeton junction'
njStations[92, 1] = 'philadelphia'
njStations[165, 1] = 'pennsauken'
njStations[80, 1] = 'mountain view'
njStations[163, 1] = 'mount arlington'
njStations[158, 1] = 'montclair state u'
njStations[107, 1] = 'glen rock main line'
njStations[114, 1] = 'glen rock boro hall'
njStations[116, 1] = 'broadway fair lawn'
njStations[132, 1] = 'essex street'
# Line
lines = rbind(
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Northeast Corrdr') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/6/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Atl. City Line') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/7/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Bergen Co. Line ') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/9/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Gladstone Branch') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/4/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Main Line') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/8/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Montclair-Boonton') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/3/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Morristown Line') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/10/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'No Jersey Coast') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/12/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Pascack Valley') %>%
dplyr::select(line),
st_read('https://services6.arcgis.com/M0t0HPE53pFK525U/arcgis/rest/services/NJTransit_Rail_Linea/FeatureServer/11/query?outFields=*&where=1%3D1&f=geojson') %>%
st_transform(crs) %>%
mutate(line = 'Raritan Valley') %>%
dplyr::select(line)
)
base_map <- get_map(location = unname(st_bbox(ll(st_buffer(st_union(njCounties),11000)))),maptype = "terrian")
ggmap(base_map) +
geom_sf(data=ll(st_union(njCounties)),color="black",size=1.2,fill = "transparent",inherit.aes = FALSE)+
geom_sf(data=ll(njCounties),color="black",size=0.5,linetype ="dashed",fill = "transparent",inherit.aes = FALSE)+
geom_sf(data=ll(lines),size=1.5,color=palette1_main,inherit.aes = FALSE)+
labs(title = "Map of NJ Transit Railway Stations",
subtitle = "Red dots are stations",
x="",y="")+
mapTheme()
In the above graph, we can see the geographical distribution of NJ Transit stations are mainly in the north-east of New Jersey, close to New York.
Data of NJ Transit Train Delay in September and October of 2019 are used. For further operation, stations’ names are standardized. In this section, several exploratory analysis will be done to explained concepts, phenomenons, and patterns.
# 2019.9
trainsSep = read.csv('2019_09.csv') %>%
filter(., type == 'NJ Transit' & status == 'departed' &
line != 'Princeton Shuttle') %>%
mutate(date = ymd(date),
scheduled_time = ymd_hms(scheduled_time),
actual_time = ymd_hms(actual_time)) %>%
dplyr::select(-status,-type)
# 2019.10
trainsOct = read.csv('2019_10.csv') %>%
filter(., type == 'NJ Transit' & status == 'departed' &
line != 'Princeton Shuttle') %>%
mutate(date = ymd(date),
scheduled_time = ymd_hms(scheduled_time),
actual_time = ymd_hms(actual_time)) %>%
arrange(stop_sequence) %>%
dplyr::select(-status,-type)
# Bind the two-month data
trains = rbind(trainsSep, trainsOct) %>%
# Trying to standardize station names
mutate(to = tolower(to),
from = tolower(from))
# Standardize names
trainsWithGeometry = trains %>%
filter(., to != 'secaucus concourse') %>%
mutate(to =
case_when(
str_detect(to, '-') ~
substr(to, 1, str_locate(to, '-')[[1]] - 1),
substr(to, nchar(to) - 15, nchar(to)) == ' railway station'~
substr(to, 1, nchar(to) - 16),
substr(to, nchar(to) - 12, nchar(to)) == ' rail station'~
substr(to, 1, nchar(to) - 13),
substr(to, nchar(to) - 7, nchar(to)) == ' station' ~
substr(to, 1, nchar(to) - 8),
substr(to, nchar(to) - 8, nchar(to)) == ' terminal' ~
substr(to, 1, nchar(to) - 9),
TRUE ~ to
),
from =
case_when(
str_detect(from, '-') ~
substr(from, 1, str_locate(from, '-')[[1]] - 1),
substr(from, nchar(from) - 15, nchar(from)) == ' railway station'~
substr(from, 1, nchar(from) - 16),
substr(from, nchar(from) - 12, nchar(from)) == ' rail station'~
substr(from, 1, nchar(from) - 13),
substr(from, nchar(from) - 7, nchar(from)) == ' station' ~
substr(from, 1, nchar(from) - 8),
substr(from, nchar(from) - 8, nchar(from)) == ' terminal' ~
substr(from, 1, nchar(from) - 9),TRUE ~ from
)) %>%
left_join(njStations, by = c('to' = 'station')) %>%
st_sf() %>%
dplyr::select(-joined)
When the data set of NJ Transit is firstly obtained, the exact characters of field “TrainID” is confusing. To model the prediction, we need to understand the basic unit of rail transportation, TrainID. The following section will emphasize on that to clarify “TrainID”.
# A trainID is for a whole trip
trainIDforWholeTrip = trains%>%
filter(train_id == 7824, date == "2019-10-06")%>%
arrange(date, train_id)%>%
dplyr::select(train_id,stop_sequence,from,to,line)
trainIDforWholeTrip = trainIDforWholeTrip[1:5,]
# Different TrainIDs is within the same day & Express or normal lines
difTrainIDs = trains%>%
filter(line=="Northeast Corrdr",date=="2019-10-06",stop_sequence=="2")%>%
arrange(date,train_id)%>%
dplyr::select(scheduled_time,train_id,from,to,line)
difTrainIDs = difTrainIDs[1:5,]
# A TrainID is allocated for the specific time slot train everyday or more
TrainIDforDifDay = trains%>%
filter(line=="Northeast Corrdr",train_id==7824,stop_sequence=="2")%>%
arrange(date,to)%>%
dplyr::select(scheduled_time,train_id,from,to,line)
TrainIDforDifDay = TrainIDforDifDay[1:5,]
| train_id | stop_sequence | from | to | line |
|---|---|---|---|---|
| 7824 | 1 | trenton | trenton | Northeast Corrdr |
| 7824 | 2 | trenton | hamilton | Northeast Corrdr |
| 7824 | 3 | hamilton | princeton junction | Northeast Corrdr |
| 7824 | 4 | princeton junction | new brunswick | Northeast Corrdr |
| 7824 | 5 | new brunswick | edison | Northeast Corrdr |
From the first table, we can know that a trainID is costatnt for a whole trip, from first station to the last station of the line.
| scheduled_time | train_id | from | to | line |
|---|---|---|---|---|
| 2019-10-07 01:06:00 | 3800 | trenton | hamilton | Northeast Corrdr |
| 2019-10-07 01:31:00 | 3805 | new york penn station | secaucus upper lvl | Northeast Corrdr |
| 2019-10-06 03:54:00 | 3806 | trenton | hamilton | Northeast Corrdr |
| 2019-10-06 05:03:00 | 7804 | trenton | hamilton | Northeast Corrdr |
| 2019-10-06 06:03:00 | 7808 | trenton | hamilton | Northeast Corrdr |
From this table, we can know that for a same transit line, different trainIDs will be used within the same day. And even for the same trainsit line, there are several sublines, which have various initial station, terminal station, and routes.
| scheduled_time | train_id | from | to | line |
|---|---|---|---|---|
| 2019-09-01 10:05:00 | 7824 | trenton | hamilton | Northeast Corrdr |
| 2019-09-02 10:05:00 | 7824 | trenton | hamilton | Northeast Corrdr |
| 2019-09-07 10:05:00 | 7824 | trenton | hamilton | Northeast Corrdr |
| 2019-09-08 10:05:00 | 7824 | trenton | hamilton | Northeast Corrdr |
| 2019-09-14 10:05:00 | 7824 | trenton | hamilton | Northeast Corrdr |
From this table, we can know that a TrainID is allocated for the trains which run at specific same scheduled time everyday with the same route.
Besides, based on above insights, Should we divide the sublines of every transit line? In theory, detailed sublines may make observation more accurate and prediction more accurate. However, following graph displays the convoluted of the division. About 59 sublines will be needed to distinguish the sublines. That will bring unescessary redudancy.
# Distinguish Express and normal lines
trains_analysis_subline = trains%>%
filter(from != to)%>%
filter(from !="secaucus concourse" & to!="secaucus upper lvl")%>%
filter(from !="east orange" & to!="brick church")%>%
filter(from !="new york penn station" & to!="maplewood")
lineList = unique(trains_analysis_subline$line)
for (k in lineList) {
LineTool = trains_analysis_subline%>%
filter(line==k,stop_sequence=="2")%>%
arrange(date,train_id)%>%
group_by(from,to,train_id)%>%
summarise()
listFrom = unique(LineTool$from)
listTo = unique(LineTool$to)
n = 1
for (i in listFrom){
for (j in listTo) {
EvLine = filter(LineTool,from==i & to==j)
listTrainID = unique(EvLine$train_id)
if (length(listTrainID>0)){
trains_analysis_subline = trains_analysis_subline%>%mutate(line = case_when(train_id %in% listTrainID ~ paste(line,n, sep = "_subline_"),TRUE ~ line))
n = n+1}}}}
trains_analysis_subline <- as.data.frame(unique(trains_analysis_subline$line))%>%
rename("sublineList" = "unique(trains_analysis_subline$line)")%>%
arrange(sublineList)
i=1
a=1
tableForSublines <- data.frame(matrix(NA,nrow=10))
tmp1 <- trains_analysis_subline[c(1:3),]
tmp2 <- trains_analysis_subline[c(4:9),]
tmp3 <- trains_analysis_subline[c(10:14),]
tmp4 <- trains_analysis_subline[c(15:19),]
tmp5 <- trains_analysis_subline[20,]
tmp6 <- trains_analysis_subline[c(21:27),]
tmp7 <- trains_analysis_subline[c(28:36),]
tmp8 <- trains_analysis_subline[c(37:43),]
tmp9 <- trains_analysis_subline[c(44:50),]
tmp10 <- trains_analysis_subline[c(51:53),]
tmp11 <- trains_analysis_subline[c(54:59),]
length(tmp1) = length(c(1:10))
length(tmp2) = length(c(1:10))
length(tmp3) = length(c(1:10))
length(tmp4) = length(c(1:10))
length(tmp5) = length(c(1:10))
length(tmp6) = length(c(1:10))
length(tmp7) = length(c(1:10))
length(tmp8) = length(c(1:10))
length(tmp9) = length(c(1:10))
length(tmp10) = length(c(1:10))
length(tmp11) = length(c(1:10))
tableForSublines <- cbind(tableForSublines, tmp5)
tableForSublines <- cbind(tableForSublines, tmp10)
tableForSublines <- cbind(tableForSublines, tmp1)
tableForSublines <- cbind(tableForSublines, tmp3)
tableForSublines <- cbind(tableForSublines, tmp4)
tableForSublines <- cbind(tableForSublines, tmp2)
tableForSublines <- cbind(tableForSublines, tmp11)
tableForSublines <- cbind(tableForSublines, tmp6)
tableForSublines <- cbind(tableForSublines, tmp8)
tableForSublines <- cbind(tableForSublines, tmp9)
tableForSublines <- cbind(tableForSublines, tmp7)
tableForSublines = tableForSublines%>%row_to_names(row_number = 1)
tableForSublines = tableForSublines[1:5,7:12]
tableForSublines[is.na(tableForSublines)] = ""
kable(tableForSublines,align = 'c',caption = '<center>Differnet sublines for NJ Transit</center>',table.attr = "style='width:75%;'")%>%
kable_classic(full_width = T)%>%
kable_styling(position = "center")%>%
row_spec(0, bold = T,color = palette1_main)
| Bergen Co. Line | Raritan Valley | Montclair-Boonton | No Jersey Coast | Northeast Corrdr | Morristown Line | |
|---|---|---|---|---|---|---|
| 2 | Bergen Co. Line _subline_1 | Raritan Valley_subline_1 | Montclair-Boonton_subline_1 | No Jersey Coast_subline_1 | Northeast Corrdr_subline_1 | Morristown Line_subline_1 |
| 3 | Bergen Co. Line _subline_2 | Raritan Valley_subline_2 | Montclair-Boonton_subline_2 | No Jersey Coast_subline_2 | Northeast Corrdr_subline_2 | Morristown Line_subline_2 |
| 4 | Bergen Co. Line _subline_3 | Raritan Valley_subline_3 | Montclair-Boonton_subline_3 | No Jersey Coast_subline_3 | Northeast Corrdr_subline_3 | Morristown Line_subline_3 |
| 5 | Bergen Co. Line _subline_4 | Raritan Valley_subline_4 | Montclair-Boonton_subline_4 | No Jersey Coast_subline_4 | Northeast Corrdr_subline_4 | Morristown Line_subline_4 |
| 6 | Bergen Co. Line _subline_5 | Raritan Valley_subline_5 | Montclair-Boonton_subline_5 | No Jersey Coast_subline_5 | Northeast Corrdr_subline_5 | Morristown Line_subline_5 |
In the following, average delay time for different train lines and different train stations within the same line are respectively plot to analyze the pattern of it. That will give instructions on how to select predictors and models for the delay time…
# Average delay time for different train lines
plot1 <- trainsWithGeometry%>%
st_drop_geometry()%>%
dplyr::select(delay_minutes,line)%>%
na.omit()%>%
group_by(line)%>%
summarise(averageDelay=mean(delay_minutes))%>%
ggplot(aes(line,averageDelay))+
geom_bar(fill=palette1_main,position ="dodge", stat ="summary", fun ="mean")+
labs(title = "",x="Lines",y="Average Delay")+
plotTheme(5,10)+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1),
panel.spacing = unit(2, 'lines'),
panel.border = element_rect(colour = "black", fill=NA, size=.5))
plot2 <- trainsWithGeometry%>%
dplyr::select(to,delay_minutes,line)%>%
na.omit()%>%
group_by(line)%>%
summarise(averageDelay=mean(delay_minutes))%>%
st_drop_geometry()
byLine2 = lines %>%
left_join(plot2, by = 'line')%>%
ggplot()+
geom_sf(data=st_union(njCounties),color="black",size=1,fill = "transparent")+
geom_sf(data=njCounties,color="black",size=0.5,linetype ="dashed",fill = "transparent")+
geom_sf(aes(color=averageDelay),alpha=0.7,size=.75)+
scale_color_gradient(high = brightRed, low = darkBlue, name = '') +
labs(title = "",x="Stations",y="")+
mapTheme(5,10)+
theme(legend.position = "none")
grid.arrange(plot1, byLine2,ncol=2,top = paste("All Lines"," Spatial Delay Distribution",sep = ""))
# Average delay time for different train stations within the same line
linelist <- unique(trainsWithGeometry$line)
for (i in linelist){
plot1 <- trainsWithGeometry%>%
st_drop_geometry()%>%
dplyr::select(to,delay_minutes,line)%>%
na.omit()%>%
group_by(to,line)%>%
summarise(averageDelay=mean(delay_minutes))%>%
filter(line == i)%>%
ggplot(aes(to,averageDelay))+
geom_bar(fill=palette1_main,position ="dodge", stat ="summary", fun ="mean")+
labs(title = "",x="Stations",y="Average Delay")+
plotTheme(5,10)+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1),
panel.spacing = unit(2, 'lines'),
panel.border = element_rect(colour = "black", fill=NA, size=.5))
plot2 <- trainsWithGeometry%>%
dplyr::select(to,delay_minutes,line)%>%
na.omit()%>%
group_by(to,line)%>%
summarise(averageDelay=mean(delay_minutes))%>%
filter(line == i)%>%
ggplot()+
geom_sf(data=st_union(njCounties),color="black",size=1,fill = "transparent")+
geom_sf(data=njCounties,color="black",size=0.5,linetype ="dashed",fill = "transparent")+
geom_sf(data=njStations,size=1,color=palette1_assist,alpha=0.5)+
geom_sf(aes(size=averageDelay),color=palette1_main,alpha=0.7)+
scale_size_continuous(range = c(1, 3))+
labs(title = "",x="Stations",y="")+
mapTheme(5,10)+
theme(legend.position = "none")
g <- arrangeGrob(plot1, plot2,ncol=2,top = paste(i," Spatial Delay Distribution",sep = ""))
ggsave(path="../geo",paste(i,"geo.png",sep = ""),g)}
# list.files(path="./geo",pattern = '*.png', full.names = TRUE) %>%
# image_read() %>%
# image_join() %>%
# image_animate(fps=1) %>%
# image_write("./geo/allLines.gif")
The previous graphs reveals different average delays across lines and stations, suggesting that the line and station are important factors to predict delays. Especially, we find that the Atlantic City Line, which is detached from the main network, is an outlier with an average delay length four times compared to the rest…
In the following, delay time, in the scale of time, for different train lines and different train stations within the same line are respectively plot to analyze the periodic pattern of it. That will give instructions on how to select predictors and models for the delay time…
monday <-trainsWithGeometry%>%
st_drop_geometry()%>%
filter(line=="No Jersey Coast")%>%
na.omit()%>%
mutate(interval_hour = ymd_h(substr(scheduled_time, 1, 13)),
dayotw = wday(scheduled_time),
week = week(scheduled_time),
hour = hour(scheduled_time))%>%
arrange(dayotw,interval_hour)%>%
filter(week%in%(36:43))%>%
mutate(monday = ifelse(dayotw == 1 & hour == 0,interval_hour, 0)) %>%
filter(monday != 0)
linelist <- unique(trainsWithGeometry$line)
for (i in linelist){
plot1 <- trainsWithGeometry%>%
st_drop_geometry()%>%
filter(line==i)%>%
na.omit()%>%
mutate(interval_hour = ymd_h(substr(scheduled_time, 1, 13)),
week = week(scheduled_time))%>%
filter(week%in%(36:43))%>%
group_by(interval_hour,line,date)%>%
summarise(AverageDelayInHour = mean(delay_minutes))%>%
ggplot(aes(interval_hour,AverageDelayInHour)) +
geom_line(color=palette1_main,size=1)+
geom_vline(data = monday, aes(xintercept = interval_hour),linetype = "dashed",size=1)+
labs(title = paste("Line:",i,sep = ""),x='',y="Average Delay by Hour")+
plotTheme(10,20)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=3))
stationlist <- trainsWithGeometry%>%filter(line==i)
stationlist <- unique(stationlist$to)
for (j in stationlist){
plot2 <- trainsWithGeometry%>%
st_drop_geometry()%>%
filter(line==i & to==j)%>%
na.omit()%>%
mutate(interval_hour = ymd_h(substr(scheduled_time, 1, 13)),
week = week(scheduled_time))%>%
filter(week%in%(36:43))%>%
group_by(interval_hour,date)%>%
summarise(AverageDelayInHour = mean(delay_minutes))%>%
ggplot(aes(interval_hour,AverageDelayInHour)) +
geom_line(color=palette1_main,size=1)+
geom_vline(data = monday, aes(xintercept = interval_hour),linetype = "dashed",size=1)+
labs(title = paste("Station:",j,sep = ""),x="Interval Hour",y="Average Delay by Hour")+
plotTheme(10,20)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=3))
g <- arrangeGrob(plot1, plot2,ncol=1)
ggsave(path="../temporal",paste(i,j,"_temporal.png",sep = "_"),g)}}
# list.files(path="../temporal/select",pattern = '*.png', full.names = TRUE) %>%
# image_read() %>% # reads each path file
# image_join() %>% # joins image
# image_animate(fps=1) %>% # animates, can opt for number of loops
# image_write("../temporal/select") # write to current dir
The above graphs show delays follow periodic cycles of hours and days, suggesting that the hour and day-of-week fixed-effects should be employed in the model. On the other hand, the temporal patterns vary across different stations and lines. Therefore, delay time-lags of the line and station are also included in our model.
Firstly, features related to transit operations are added into the models. This engineering mainly comes from THREE parts. Station, line and train.
### Delays on this line two hours prior
# Add a field: time of prediction
trains = trains %>%
mutate(timeOfPrediction = scheduled_time - hours(2))
timeLagBase =
trains %>%
dplyr::select(date, line, actual_time, delay_minutes)
findLagDelay = function(timeOfPrediction, line, date){
narrowDown =
timeLagBase[timeLagBase$line == line &
timeLagBase$date == date &
difftime(timeOfPrediction,
timeLagBase$actual_time,
units = 'secs') > 0,] %>%
arrange(desc(actual_time))
return(narrowDown[1, 4])
}
# Add lag delay data
for(i in 1:nrow(trains)){
date = trains[i, 'date']
line = trains[i, 'line']
timeOfPrediction = trains[i, 'timeOfPrediction']
lagDelay = findLagDelay(timeOfPrediction, line, date)
trains[i, 'lagDelay'] = lagDelay
if(i %% 1000 == 0){
print(i)
}
}
### Station-based average delay (2-hour lag)
trains = trains %>% filter(., !is.na(stop_sequence))
stationAvgDelayBase = trains %>%
dplyr::select(station = to, actual_time, delay_minutes) %>%
mutate(actual_time = ymd_hms(actual_time))
findAvgStationDelay = function(timeOfPrediction, depStation){
narrowDown =
stationAvgDelayBase %>%
filter(., station == depStation &
actual_time %within% interval(timeOfPrediction - hours(1), timeOfPrediction))
return(mean(narrowDown$delay_minutes, na.rm = T))
}
# Add station lag delay data
for(i in 1:nrow(trains)){
depStation = trains[i, 'to']
timeOfPrediction = trains[i, 'timeOfPrediction']
stationDelay = findAvgStationDelay(timeOfPrediction, depStation)
trains[i, 'stationDelay'] = stationDelay
if(i %% 1000 == 0){
print(i)
}
}
### Delay station average 2 hours lag 1 station
# Add data
for(i in 1:nrow(trains)){
depStation = trains[i, 'from']
timeOfPrediction = trains[i, 'timeOfPrediction']
lag1StationDelay = findAvgStationDelay(timeOfPrediction, depStation)
trains[i, 'lag1StationDelay'] = lag1StationDelay
if(i %% 1000 == 0){
print(i)
}
}
### Delay station average 2 hours lag 2 stations
lastStation =
trains %>%
dplyr::select(date, train = train_id, sequence = stop_sequence, lag2Station = from) %>%
mutate(sequencePlus = sequence + 1) %>%
dplyr::select(-sequence)
trains =
left_join(trains, lastStation, by = c('date' = 'date',
'train_id' = 'train',
'stop_sequence' = 'sequencePlus')) %>%
mutate(lag2Station = ifelse(is.na(lag2Station), from, lag2Station))
# Add data
for(i in 1:nrow(trains)){
depStation = trains[i, 'lag2Station']
timeOfPrediction = trains[i, 'timeOfPrediction']
lag2StationDelay = findAvgStationDelay(timeOfPrediction, depStation)
trains[i, 'lag2StationDelay'] = lag2StationDelay
if(i %% 1000 == 0){
print(i)
}
}
### Lag one day
trains =
trains %>%
group_by(train_id, to) %>%
arrange(scheduled_time) %>%
mutate(lagDelayDayPlus = lag(delay_minutes, 1)) %>%
ungroup() %>%
mutate(actualTime = ymd_hms(actual_time),
scheduledTime = ymd_hms(scheduled_time),
date = ymd(date)) %>%
rename(delay = delay_minutes, train = train_id)
write.csv(trains, 'trains.csv')
lagVarScatter <- trains%>%
dplyr::select(delay,lagDelay,stationDelay,lag1StationDelay,lag2StationDelay,lagDelayDayPlus)%>%
gather(Variable, Value, -delay) %>%
mutate(Variable = fct_relevel(Variable,"Realtime Lag Delay","Average Station Delay","LagOneStationDelay","LagTwoStationDelay","LagDelayDayPlus"))
correlation.lag <-
group_by(lagVarScatter, Variable) %>%
summarize(correlation = round(cor(Value, delay,
use = "complete.obs"), 2))
ggplot(lagVarScatter%>% sample_n(30000), aes(Value, delay)) +
geom_point(size = .1, color = palette1_main) +
geom_text(data = correlation.lag,aes(label = paste("r =", round(correlation, 2))),x=-Inf, y=Inf, vjust = 2, hjust = -.5,size=4) +
geom_smooth(method = "lm", se = FALSE, colour = palette1_assist,size=1) +
facet_wrap(~Variable, ncol = 3, scales = "free") +
labs(
x="Lag Trip Count",y="Delay",
title = "Rideshare trip count as a function of time lags") +
plotTheme()
This line’s delay lagDelay, This station’s delaystationDelay, Previous station’s delaylag1StationDelay, Previous of previous station’s delaylag2StationDelay, and previous delay of same train at same stationlagDelayDayPlus are modeled into the model. In the following, Weather data is also added into the model.
# Weather
network = riem_networks()
weatherStations = riem_stations(network = 'NJ_ASOS') %>%
st_as_sf(coords = c('lon', 'lat'), crs = 4326, agr = 'constant') %>%
st_transform(crs)
weatherStations$weatherStationIndex = row.names(weatherStations) %>% as.numeric()
njStations$weatherStationIndex =
get.knnx(
st_coordinates(weatherStations),
st_coordinates(njStations), 1)$nn.index
# Determine which weather station for which transit station
njStations = njStations %>%
left_join(st_drop_geometry(weatherStations) %>%
dplyr::select(weatherStationID = id, weatherStationIndex),
by = 'weatherStationIndex')
# Add weatherStationID
trains = trains %>%
filter(., to != 'secaucus concourse') %>%
left_join(njStations, by = c('to' = 'station')) %>%
st_sf() %>%
dplyr::select(-joined, -weatherStationIndex)
# Get weather Data
weatherStationList = unique(trains$weatherStationID)
weather = data.frame()
for(i in weatherStationList){
weatherSeg = riem_measures(station = i,
date_start = '2019-09-01',
date_end = '2019-10-31') %>%
mutate_if(is.character, list(~replace(as.character(.), is.na(.), "0"))) %>%
replace(is.na(.), 0) %>%
mutate(hour = ymd_h(substr(valid, 1, 13))) %>%
mutate(week = week(hour),
dayOfWeek = wday(hour, label = T)) %>%
group_by(hour) %>%
summarize(temp = max(tmpf),
precip = sum(p01i),
windSpeed = max(sknt)) %>%
mutate(temp = ifelse(temp == 0, 42, temp)) %>%
mutate(precip = ifelse(precip > 0, precip, 0)) %>%
mutate(weatherStationID = i)
weather = rbind(weather, weatherSeg)
}
# Join Data
trains$timeHour = floor_date(trains$scheduled_time, unit = 'hour')
trains = trains %>%
left_join(weather,
by = c('weatherStationID' = 'weatherStationID',
'timeHour' = 'hour'))%>%
st_sf()
weatherVarScatter <- trains%>%
st_drop_geometry()%>%
dplyr::select(delay,temp,precip,windSpeed)%>%
gather(Variable, Value, -delay)
correlation.lag <-
group_by(weatherVarScatter, Variable) %>%
summarize(correlation = round(cor(Value, delay,
use = "complete.obs"), 2))
ggplot(weatherVarScatter%>% sample_n(30000), aes(Value, delay)) +
geom_point(size = .1, color = palette1_main) +
geom_text(data = correlation.lag,aes(label = paste("r =", round(correlation, 2))),x=-Inf, y=Inf, vjust = 2, hjust = -.5,size=4) +
geom_smooth(method = "lm", se = FALSE, colour = palette1_assist,size=1) +
facet_wrap(~Variable, ncol = 3, scales = "free") +
labs(
x="",y="Delay",
title = "Time Delay of Weather") +
plotTheme()
Besides, fixed effect indicator, Distance to the first staion of the train, and direction of uptown and downtown are also included in the model.
### Add time fixed effect
trains2 = trains %>%
mutate(hour = hour(scheduled_time),
dayOfWeek = wday(scheduled_time),
week = week(scheduled_time))%>%
filter(week %in% c(37:43))%>%
mutate(legend = case_when(
week %in% c(37:41) ~ "train",
TRUE ~ "test"))
stations = trains2 %>%dplyr::select(station = to, geometry) %>%unique()
### Distance to the first staion of the train
trainIDAndfirstStations = trains2 %>%
filter(., stop_sequence == 1) %>%
dplyr::select(firstStation = to, train) %>%
unique()
firstStations = trainIDAndfirstStations %>%
dplyr::select(firstStation) %>%
unique()
distanceMatrix =
expand.grid(firstStation = firstStations$firstStation,
njStation = stations$station)
for(i in 1:nrow(distanceMatrix)){
thisFirstStation = distanceMatrix[i, 1]
thisStation = distanceMatrix[i, 2]
distance = st_distance(
firstStations %>% filter(., firstStation == thisFirstStation),
stations %>% filter(., station == thisStation)
) %>% as.numeric()
distanceMatrix[i, 'distance'] = distance
if(i %% 100 == 0){
print(i)
}
}
trainIDAndStations = trains2 %>%
dplyr::select(station = to, train) %>%
unique() %>%
left_join(trainIDAndfirstStations %>% st_drop_geometry,
by = 'train') %>%
left_join(distanceMatrix, by = c('firstStation' = 'firstStation',
'station' = 'njStation'))
trains3 = trains2 %>%
left_join(st_drop_geometry(trainIDAndStations) %>%
dplyr::select(-firstStation),
by = c('to' = 'station',
'train' = 'train')) %>%
rename(distFirstStation = distance)
### Uptown and Downtown
NYPennStation = stations %>%
filter(., station == 'new york penn')
stations = stations %>%
mutate(distNYPenn =
st_distance(., NYPennStation) %>% as.numeric())
trains4 = trains3 %>%
left_join(st_drop_geometry(stations) %>%
rename(lastStationDistNYPenn = distNYPenn),
by = c('from' = 'station')) %>%
left_join(st_drop_geometry(stations) %>%
rename(thisStationDistNYPenn = distNYPenn),
by = c('to' = 'station')) %>%
mutate(direction =
ifelse(lastStationDistNYPenn >= thisStationDistNYPenn,
'downtown', 'uptown'))
trains4 = trains4 %>%
mutate(lagDelay = tidyr::replace_na(lagDelay, 0),
stationDelay = tidyr::replace_na(stationDelay, 0),
lag1StationDelay = tidyr::replace_na(lag1StationDelay, 0),
lag2StationDelay = tidyr::replace_na(lag2StationDelay, 0),
lagDelayDayPlus = tidyr::replace_na(lagDelayDayPlus, 0),
dayOfWeek = as.character(dayOfWeek),
hour = as.character(hour))
correlation.lag <-
trainsFinal %>%
dplyr::select(delay,distFirstStation)%>%
summarize(correlation = round(cor(distFirstStation, delay,
use = "complete.obs"), 2))
grid.arrange(ncol=2,
ggplot(trainsFinal%>% sample_n(30000), aes(distFirstStation, delay)) +
geom_point(size = .1, color = palette1_main) +
geom_text(data = correlation.lag,aes(label = paste("r =", round(correlation, 2))),x=-Inf, y=Inf, vjust = 2, hjust = -.5,size=4) +
geom_smooth(method = "lm", se = FALSE, colour = palette1_assist,size=1) +
labs(
x="Distance",y="Delay",
title = "",
subtitle = "") +
plotTheme(),
trainsFinal %>%
na.omit()%>%
dplyr::select(delay,direction) %>%
ggplot(aes(direction,delay)) +
geom_bar(position ="dodge", stat ="summary", fun ="mean",fill = palette1_main) +
labs(x="Direction", y="Mean",
title ="",
subtitle = "") +
plotTheme() +
theme(legend.position = "none"))
Furthermore, ten tiles division are introduce into the model. We use trainsTrain mean delay to do so
trainsTrain = trainsFinal %>%
filter(., legend == 'train')
stationHourMean = trainsTrain %>%
group_by(to, hour) %>%
summarize(meanDelay = mean(delay, na.rm = T)) %>%
mutate(stationHour10Tile = ntile(meanDelay, 10) %>% as.character()) %>%
st_drop_geometry() %>%
mutate(hour = as.character(hour))
trainsTrain = trainsTrain %>%
left_join(stationHourMean %>% dplyr::select(-meanDelay),
by = c('to' = 'to',
'hour' = 'hour'))
trainsTest = trainsFinal %>%
filter(., legend == 'test') %>%
left_join(stationHourMean %>% dplyr::select(-meanDelay),
by = c('to' = 'to',
'hour' = 'hour'))
HERE SHOULD ADD a ten tile kable to see the mean delay of each tile
# Baseline
reg2hBaseline = lm(delay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip + stationHour10Tile + distFirstStation + direction,
data = trainsTrain)
trainsTestBaseLine = trainsTest %>%
mutate(predict2hBaseline = predict(reg2hBaseline, newdata = trainsTest),
absError2hBaseline = abs(delay - predict2hBaseline))%>%
st_drop_geometry()%>%
dplyr::select(absError2hBaseline,line,delay,predict2hBaseline)
MAEForBaseline <- mean(trainsTestBaseLine$absError2hBaseline, na.rm = T)%>%as.data.frame()
trainsTestBaseLine%>%sample_n(30000)%>%
ggplot() +
geom_point(aes(x = delay, y = predict2hBaseline), size = 0.1,color = palette1_main) +
geom_text(data = MAEForBaseline,aes(label = paste("MAE =", round(., 2))),x=-Inf, y=Inf, vjust = 2, hjust = -.5,size=4) +
geom_abline(slope = 1, intercept = 0, colour = palette1_assist,size=1)+
labs(
x="Actual Delay",y="Predicted Delay",
title = "",
subtitle = "") +
plotTheme()
trainsTestBaseLine = trainsTestBaseLine %>%
mutate(tile = case_when(delay < 20 ~ "<20 min",
delay < 40 ~ "20~40 min",
delay < 60 ~ "40~60 min",
delay < 80 ~ "60~80 min",
delay <= 100 ~ "80~100 min",
TRUE ~ ">100 min") %>%
factor(, levels = c("<20 min", "20~40 min",
"40~60 min", "60~80 min",
"80~100 min", ">100 min")))
byTile =
trainsTestBaseLine %>%
group_by(tile) %>%
summarize(meanObs = mean(delay, na.rm = T),
meanPredLong = mean(predict2hBaseline, na.rm = T)) %>%
gather(variable, value, -tile) %>%
mutate(variable = recode(variable,
"meanObs" = "Observed",
"meanPred" = "Predicted"))
byTile %>%
ggplot(aes(tile, value, shape = variable)) +
geom_point(size = 2,color=palette1_main) +
geom_path(aes(group = tile), color = palette1_assist) +
scale_shape_manual(values = c(2, 17), name = "") +
labs(x = 'Actual mean delay', y = 'Predicted mean delay') +
plotTheme() +
theme(legend.position = "bottom")
theme(axis.text.x = element_text(angle = 90, size = 20, vjust = 0.5, hjust = 1))
byLine0 =
trainsTestBaseLine %>%
group_by(line) %>%
summarize(MAE2h = mean(absError2hBaseline, na.rm = T))
byLine = lines %>%
left_join(byLine0, by = 'line')%>%
dplyr::select(line, MAE2h)
byLine %>%
ggplot() +
geom_sf(data = st_union(njCounties),
color='black', size=1.5, fill = NA)+
geom_sf(data = njCounties,
color= 'black', size=0.5,linetype = 'dashed', fill = NA) +
geom_sf(aes(color = MAE2h), size = 1.5) +
scale_color_gradient(high = brightRed, low = darkBlue, name = '') +
mapTheme()+
theme(legend.position = "bottom",
legend.key.width = unit(2, 'cm'),
legend.key.height = unit(1, 'cm'))
trainsTrain <- trainsTrain%>%filter(line!="Atl. City Line")
trainsTest <- trainsTrain%>%filter(line!="Atl. City Line")
# Baseline
reg2hBaseline = lm(delay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip + stationHour10Tile + distFirstStation + direction,
data = trainsTrain)
trainsTest = trainsTest %>%
mutate(predict2hBaseline = predict(reg2hBaseline, newdata = trainsTest),
absError2hBaseline = abs(delay - predict2hBaseline))
# log the dependent (not good)
trainsTrain = trainsTrain %>%mutate(logDelay = log(delay + 1))
reg2hLnLog = lm(logDelay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip+ stationHour10Tile + distFirstStation + direction,
data = trainsTrain)
trainsTest = trainsTest %>%
mutate(logPredict2hLog = predict(reg2hLnLog, newdata = trainsTest),
predict2hLog = exp(logPredict2hLog) - 1,
absError2hLog = abs(delay - predict2hLog))
# sq the dependent (better)
trainsTrain = trainsTrain %>%mutate(sqDelay = sqrt(delay))
reg2hLnSq = lm(sqDelay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip+ stationHour10Tile + distFirstStation + direction,
data = trainsTrain)
trainsTest = trainsTest %>%
mutate(sqPredict2hSq = predict(reg2hLnSq, newdata = trainsTest),
predict2hSq = sqPredict2hSq ^ 2,
absError2hSq = abs(delay - predict2hSq))
# sq the dependent, and some independents squared
reg2hLnSqSSq =lm(sqDelay ~
stop_sequence + to + line + lagDelay + I(lagDelay ^ 2) + stationDelay +
I(stationDelay ^ 2) + lag1StationDelay +
lag2StationDelay + lagDelayDayPlus +hour + dayOfWeek + precip + stationHour10Tile +
direction + distFirstStation,
data = trainsTrain)
trainsTest = trainsTest %>%
mutate(sqPredict2hSSqSq = predict(reg2hLnSqSSq, newdata = trainsTest),
predict2hSqSSq = sqPredict2hSSqSq ^ 2,
absError2hSqSSq = abs(delay - predict2hSqSSq))
ErrComparison <- trainsTest%>%
st_drop_geometry()%>%
dplyr::select(delay,starts_with("absError"))%>%
summarise(Baseline=mean(absError2hBaseline,na.rm=T),
LogTrans=mean(absError2hLog,na.rm=T),
SqrtDep=mean(absError2hSq,na.rm=T),
SqrtDepIndep=mean(absError2hSqSSq,na.rm=T))
trainsTest%>%
dplyr::select(delay,starts_with("absError"))%>%
st_drop_geometry()%>%
gather(Variable, Value, -delay)%>%
filter(Variable != 'absError2hSeg')%>%
mutate(Variable=recode(Variable,
"absError2hBaseline" = 'BaseLine',
"absError2hLog" = "Log DEP Var",
"absError2hSq" = 'Sqrt DEP Var',
"absError2hSqSSq" = 'Sqrt DEP & IND Var')%>%
factor(., levels = c('BaseLine','Log DEP Var','Sqrt DEP Var','Sqrt DEP & IND Var')))%>%
ggplot() +
geom_point(aes(x = delay, y = Value),size=.1,color=palette1_main) +
facet_wrap(~Variable,scales = "free",ncol=2)+
labs(x="Delay",y="Error",title = "")+
plotTheme()
ErrComparison = ErrComparison%>%rename('Log Transform'='LogTrans','Sqrt DEP Var'='SqrtDep','Sqrt DEP & IND Vars'='SqrtDepIndep')
kable(ErrComparison,align = 'c',caption = '<center>Table.A </center>',table.attr = "style='width:75%;'")%>%
kable_classic(full_width = T)%>%
kable_styling(position = "center")%>%
row_spec(0, bold = T,color = palette1_main)
| Baseline | Log Transform | Sqrt DEP Var | Sqrt DEP & IND Vars |
|---|---|---|---|
| 2.772074 | 2.67858 | 2.635322 | 2.635088 |
ggplot(trainsTrain) +
geom_histogram(aes(delay), bins = 50,fill=palette1_main)+
labs(title = '',x='Delay',y='Count') +
plotTheme()
Since the outcome distribution is heavily skewed. A logit transformation should work well here.
trainsTrain = trainsTrain %>%
mutate(late = ifelse(delay > 5, 1, 0))
trainsTest = trainsTest %>%
mutate(late = ifelse(delay > 5, 1, 0))
lateOrNot = glm(late ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip + stationHour10Tile + distFirstStation + direction,
data = trainsTrain,
family="binomial" (link="logit"))
lateProbs = data.frame(
outcome = as.factor(trainsTest$late),
probs = predict(lateOrNot, trainsTest, type = 'response'))
ggplot(lateProbs, aes(x = probs, fill = as.factor(outcome))) +
geom_density(color=FALSE) +
facet_grid(outcome ~.) +
scale_fill_manual(values = palette2,name = "Delay or Not") +
xlim(0, 1) +
plotTheme()+
labs(title = '',x='Probability',y='Density')+
theme(legend.position = "bottom")
But from the drawing, we can see the density diagram is heavily overlapped. That means the logistic regression may not be suitable here. Therefore we still focuses on the baseline model. From the plots, we can find out that the prediction error mainly exist in the small delay intervals. So the following section will try to focus on the small delays.
reg2hLnSqSMALLDELAYS = lm(sqDelay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip + stationHour10Tile + distFirstStation + direction,
data = trainsTrain %>% filter(., delay <= 30))
trainsTest = trainsTest %>%
mutate(sqPredict2hSqSMALLDELAYS =
predict(reg2hLnSqSMALLDELAYS, newdata = trainsTest),
predict2hSqSMALLDELAYS = sqPredict2hSqSMALLDELAYS ^ 2)
trainsTest = trainsTest %>%
mutate(predict2hSelectionModel =
ifelse(predict2hSq <= 20,
predict2hSqSMALLDELAYS,
predict2hSq),
absError2hSeg = abs(delay - predict2hSelectionModel))
correlationForSmall <- mean(trainsTest$absError2hSeg, na.rm = T)%>%as.data.frame()
ggplot(trainsTest) +
geom_point(aes(x = delay, y = predict2hSelectionModel), size = 0.1,color = palette1_main) +
geom_text(data = correlationForSmall,aes(label = paste("MAE =", round(., 2))),x=-Inf, y=Inf, vjust = 2, hjust = -.5,size=4) +
geom_abline(slope = 1, intercept = 0, colour = palette1_assist,size=1)+
labs(
x="Actual Delay",y="Predicted Delay",
title = "",
subtitle = "") +
plotTheme()
Following is the CV for the dataset.
trainsL = trainsFinal %>%
filter(line!="Atl. City Line")%>%
mutate(sqDelay = sqrt(delay)) %>%
left_join(stationHourMean %>% dplyr::select(-meanDelay),
by = c('to' = 'to',
'hour' = 'hour'))
dateList = unique(trainsL$date)
MAE2hList = list()
MAE30mList = list()
dayNoList = list()
for (dayNo in dateList){
gc()
dayNoList = append(dayNoList, dayNo)
cat('This hold-out test day is', dayNo, '\n')
daysTrainL = trainsL %>% filter(., date != dayNo) %>% as.data.frame()
daysTestL = trainsL %>% filter(., date == dayNo) %>% as.data.frame()
reg2h = lm(sqDelay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip + stationHour10Tile + direction,
data = daysTrainL)
reg2hSmall = lm(sqDelay ~
stop_sequence + to + line + lagDelay + stationDelay +
lag1StationDelay + lag2StationDelay + lagDelayDayPlus +
hour + dayOfWeek + precip + stationHour10Tile + direction,
data = daysTrainL %>% filter(., delay <= 30))
testCompareL = data.frame(observation = daysTestL$delay,
sqPrediction = predict(reg2h, newdata = daysTestL),
sqPredictionSmall =predict(reg2hSmall, newdata = daysTestL)) %>%
mutate(prediction = sqPrediction ^ 2,
predictionSmall = sqPredictionSmall ^ 2,
prediction = ifelse(
prediction < 20,
predictionSmall,
prediction
)) %>%
mutate(absError = abs(observation - prediction))
MAE2h = mean(testCompareL$absError, na.rm = T)
MAE2hList = append(MAE2hList, MAE2h)
cat('The 2h-MAE is', MAE2h, '\n')
}
CVResult = cbind(MAE2hList)%>%
as.data.frame() %>%
mutate(MAE2h = MAE2hList %>% as.numeric()) %>%
dplyr::select(-MAE2hList) %>%
cbind(dateList) %>%
as.data.frame() %>%
rename(date = dateList)
st_write(CVResult,"CVResult.csv")
CVResult <- read.csv("CVResult.csv")
CVResult <- CVResult%>%mutate(date = ymd(date))%>%na.omit()%>%transform(CVResult, MAE2h = as.numeric(MAE2h))
ggplot(data=CVResult,aes(date, MAE2h)) +
geom_point(size = 2,color=palette1_main) +
geom_line(data=CVResult,aes(date, MAE2h),color=palette1_assist,line=2) +
labs(x = 'Day', y = 'MAE') +
plotTheme()
From the outcome, we can see that Higher error with higher delays on weekends, 30-minute-ahead prediction not much better than 2-hour-ahead.
finalComparison = trainsTest %>%
mutate(obs= delay,
predLong = predict2hSelectionModel,
absErrLong = absError2hSeg,) %>%
dplyr::select(date, train, station = to, line, GEOID,
week, hour, dayOfWeek, obs, predLong, absErrLong)
byStationByWeek =
finalComparison %>%
group_by(station, week) %>%
summarize(MAE2h = mean(absErrLong, na.rm = T)) %>%
mutate(week = recode(week, '42' = 'Week 42', '43' = 'Week 43'))
byStationByWeek %>%
ggplot() +
geom_sf(data = st_union(njCounties),
color='black', size=1.5, fill = NA)+
geom_sf(data = njCounties,
color= 'black', size=0.5,linetype = 'dashed', fill = NA) +
geom_sf(aes(color = MAE2h), size = 0.6) +
scale_color_gradient(high = brightRed, low = darkBlue) +
facet_wrap(~week) +
mapTheme() +
theme(legend.position = "bottom",panel.spacing = unit(6, 'lines'))
byStationByDayOfWeek =
finalComparison %>%
group_by(station, dayOfWeek) %>%
summarize(MAE2h = mean(absErrLong, na.rm = T)) %>%
mutate(dayOfWeek = recode(dayOfWeek,
'3' = 'Mon',
'4' = 'Tue',
'5' = 'Wed',
'6' = 'Thu',
'7' = 'Fri',
'1' = 'Sat',
'2' = 'Sun') %>%
factor(., levels = c('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')))
byStationByDayOfWeekAnimation =
byStationByDayOfWeek %>%
mutate(MAE2h = ifelse(MAE2h > 5, 5, MAE2h)) %>%
ggplot() +
geom_sf(data = st_union(njCounties),
color='black', size=4, fill = NA)+
geom_sf(data = njCounties,
color= 'black', size=2,linetype = 'dashed', fill = NA) +
geom_sf(aes(color = MAE2h), size = 6) +
scale_color_gradient(high = brightRed, low = darkBlue,
name = '',
breaks = c(0, 1, 2, 3, 4, 5),
labels = c("0", "1", "2", "3", "4", "5+")) +
labs(title = '{current_frame}') +
transition_manual(dayOfWeek) +
mapTheme(36,40) +
theme(legend.position = "bottom",
panel.spacing = unit(12, 'lines'),
legend.title = element_text(family = 'font', colour = "black", face = "italic", size = 36),
legend.text = element_text(family = 'font', colour = "black", face = "italic", size = 34),
legend.key.width = unit(2, 'cm'),
legend.key.height = unit(1, 'cm'))
anim_save("byStationByDayOfWeekAnimation.gif", byStationByDayOfWeekAnimation,
duration=10, renderer = gifski_renderer() ,height = 2000, width =2000)
byStationByHour =
finalComparison %>%
mutate(hour = as.numeric(hour)) %>%
group_by(station, hour) %>%
summarize(MAE2h = mean(absErrLong, na.rm = T))
byStationByHourAnimation =
byStationByHour %>%
mutate(MAE2h = ifelse(MAE2h > 5, 5, MAE2h)) %>%
ggplot() +
geom_sf(data = st_union(njCounties),
color='black', size=4, fill = NA)+
geom_sf(data = njCounties,
color= 'black', size=2,linetype = 'dashed', fill = NA) +
geom_sf(aes(color = MAE2h), size = 6) +
scale_color_gradient(high = brightRed, low = darkBlue,
name = '',
breaks = c(0, 1, 2, 3, 4, 5),
labels = c("0", "1", "2", "3", "4", "5+")) +
labs(title = '{current_frame}:00 ~ {current_frame}:59') +
transition_manual(hour) +
mapTheme(36,40) +
theme(legend.position = "bottom",
panel.spacing = unit(12, 'lines'),
legend.title = element_text(family = 'font', colour = "black", face = "italic", size = 36),
legend.text = element_text(family = 'font', colour = "black", face = "italic", size = 34),
legend.key.width = unit(2, 'cm'),
legend.key.height = unit(1, 'cm'))
anim_save("byStationByHourAnimation.gif", byStationByHourAnimation,
duration=10, renderer = gifski_renderer(), height = 2000, width = 2000)
byLine0 =
finalComparison %>%
st_drop_geometry %>%
group_by(line) %>%
summarize(MAE2h = mean(absErrLong, na.rm = T))
byLine = lines %>%
left_join(byLine0, by = 'line')%>%
dplyr::select(line, MAE2h)
byLine %>%
ggplot() +
geom_sf(data = st_union(njCounties),
color='black', size=1.5, fill = NA)+
geom_sf(data = njCounties,
color= 'black', size=0.5,linetype = 'dashed', fill = NA) +
geom_sf(aes(color = MAE2h), size = 1.5) +
scale_color_gradient(high = brightRed, low = darkBlue, name = '') +
mapTheme()+
theme(legend.position = "bottom",
legend.key.width = unit(2, 'cm'),
legend.key.height = unit(1, 'cm'))
finalComparison = finalComparison %>%
mutate(tile = case_when(obs < 20 ~ "<20 min",
obs < 40 ~ "20~40 min",
obs < 60 ~ "40~60 min",
obs < 80 ~ "60~80 min",
obs <= 100 ~ "80~100 min",
TRUE ~ ">100 min") %>%
factor(, levels = c("<20 min", "20~40 min",
"40~60 min", "60~80 min",
"80~100 min", ">100 min")))
byTile =
finalComparison %>%
group_by(tile) %>%
summarize(meanObs = mean(obs, na.rm = T),
meanPredLong = mean(predLong, na.rm = T)) %>%
st_drop_geometry() %>%
gather(variable, value, -tile) %>%
mutate(variable = recode(variable,
"meanObs" = "Observed",
"meanPred" = "Predicted"))
byTile %>%
ggplot(aes(tile, value, shape = variable)) +
geom_point(size = 2,color=palette1_main) +
geom_path(aes(group = tile), color = palette1_assist) +
scale_shape_manual(values = c(2, 17), name = "") +
labs(x = 'Actual mean delay', y = 'Predicted mean delay') +
plotTheme() +
theme(legend.position = "bottom")
theme(axis.text.x = element_text(angle = 90, size = 20, vjust = 0.5, hjust = 1))